{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# How to apply tabular metrics and tests to text descriptors?"
      ],
      "metadata": {
        "id": "29489ded"
      },
      "id": "29489ded"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "from sklearn import datasets, ensemble, model_selection"
      ],
      "outputs": [],
      "metadata": {
        "id": "997d80b6"
      },
      "id": "997d80b6"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "from evidently import ColumnMapping\n",
        "from evidently.report import Report\n",
        "from evidently.test_suite import TestSuite\n",
        "\n",
        "from evidently.metrics import ColumnSummaryMetric\n",
        "from evidently.metrics import ColumnCorrelationsMetric\n",
        "from evidently.metrics import ColumnDistributionMetric\n",
        "from evidently.metrics import ColumnDriftMetric\n",
        "from evidently.metrics import ColumnValueRangeMetric\n",
        "\n",
        "from evidently.metrics import DatasetCorrelationsMetric\n",
        "from evidently.metrics import DatasetDriftMetric\n",
        "from evidently.metrics import DataDriftTable\n",
        "from evidently.metrics import ClassificationQualityByFeatureTable\n",
        "from evidently.metrics import RegressionErrorBiasTable\n",
        "\n",
        "from evidently.tests import TestColumnDrift\n",
        "from evidently.tests import TestValueRange\n",
        "from evidently.tests import TestNumberOfOutRangeValues\n",
        "from evidently.tests import TestShareOfOutRangeValues\n",
        "from evidently.tests import TestMeanInNSigmas\n",
        "from evidently.tests import TestColumnValueMin\n",
        "from evidently.tests import TestColumnValueMax\n",
        "from evidently.tests import TestColumnValueMean\n",
        "from evidently.tests import TestColumnValueMedian\n",
        "from evidently.tests import TestColumnValueStd\n",
        "from evidently.tests import TestColumnQuantile\n",
        "\n",
        "from evidently.tests import TestHighlyCorrelatedColumns\n",
        "from evidently.tests import TestTargetFeaturesCorrelations\n",
        "from evidently.tests import TestPredictionFeaturesCorrelations\n",
        "from evidently.tests import TestCorrelationChanges\n",
        "from evidently.tests import TestNumberOfDriftedColumns\n",
        "from evidently.tests import TestShareOfDriftedColumns\n",
        "\n",
        "from evidently.descriptors import TextLength, TriggerWordsPresence, OOV, NonLetterCharacterPercentage, SentenceCount, WordCount, Sentiment, RegExp"
      ],
      "outputs": [],
      "metadata": {
        "id": "dcacbf00"
      },
      "id": "dcacbf00"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "import nltk\n",
        "\n",
        "# Fetch the NLTK resources up front\n",
        "# (presumably they back the text descriptors imported above -- confirm against the Evidently docs).\n",
        "for nltk_resource in ('words', 'wordnet', 'omw-1.4'):\n",
        "    nltk.download(nltk_resource)\n",
        "\n",
        "# Kept as the cell's final expression so the return value of the last download is still displayed.\n",
        "nltk.download('vader_lexicon')"
      ],
      "outputs": [],
      "metadata": {
        "id": "a9640360"
      },
      "id": "a9640360"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Load the clothing reviews dataset from OpenML; it supplies the text columns used below.\n",
        "reviews_data = datasets.fetch_openml(\n",
        "    name='Womens-E-Commerce-Clothing-Reviews',\n",
        "    version=2,\n",
        "    as_frame='auto',\n",
        ")\n",
        "\n",
        "# The `frame` attribute of the fetched dataset is the table every later cell works on.\n",
        "reviews = reviews_data.frame"
      ],
      "outputs": [],
      "metadata": {
        "id": "2a490d0c"
      },
      "id": "2a490d0c"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Copy the rating into a `prediction` column; later cells map it as the model prediction.\n",
        "reviews['prediction'] = reviews['Rating']\n",
        "\n",
        "# Both subsets are drawn with identical settings: 5000 rows, with replacement, fixed seed, fresh index.\n",
        "sampling_options = dict(n=5000, replace=True, ignore_index=True, random_state=42)\n",
        "\n",
        "# Reference = ratings above 3, current = ratings below 3 (rows rated exactly 3 land in neither set).\n",
        "rated_above_three = reviews['Rating'] > 3\n",
        "rated_below_three = reviews['Rating'] < 3\n",
        "reviews_ref = reviews.loc[rated_above_three].sample(**sampling_options)\n",
        "reviews_cur = reviews.loc[rated_below_three].sample(**sampling_options)"
      ],
      "outputs": [],
      "metadata": {
        "id": "46341da3"
      },
      "id": "46341da3"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Shared column roles for the column-level reports and test suites below:\n",
        "# which columns hold raw text, which are categorical and which are numerical.\n",
        "column_mapping = ColumnMapping(\n",
        "    text_features=['Review_Text', 'Title'],\n",
        "    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],\n",
        "    numerical_features=['Age', 'Positive_Feedback_Count'],\n",
        ")"
      ],
      "outputs": [],
      "metadata": {
        "id": "67001997"
      },
      "id": "67001997"
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Column Metrics with Descriptors"
      ],
      "metadata": {
        "id": "37020ef3"
      },
      "id": "37020ef3"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Each descriptor is bound to the Review_Text column and then passed to a tabular metric\n",
        "# in place of a plain column name.\n",
        "questions_descriptor = RegExp(reg_exp=r'.*\\?.*', display_name=\"Questions\").for_column(\"Review_Text\")\n",
        "sentence_count_descriptor = SentenceCount(display_name=\"SentenceCount\").for_column(\"Review_Text\")\n",
        "word_count_descriptor = WordCount(display_name=\"WordCount\").for_column(\"Review_Text\")\n",
        "sentiment_descriptor = Sentiment(display_name=\"Sentiment\").for_column(\"Review_Text\")\n",
        "text_length_descriptor = TextLength(display_name=\"TextLength\").for_column(\"Review_Text\")\n",
        "\n",
        "table_column_metrics_report = Report(\n",
        "    metrics=[\n",
        "        ColumnSummaryMetric(column_name=questions_descriptor),\n",
        "        ColumnDriftMetric(column_name=sentence_count_descriptor),\n",
        "        ColumnCorrelationsMetric(column_name=word_count_descriptor),\n",
        "        ColumnDistributionMetric(column_name=sentiment_descriptor),\n",
        "        ColumnValueRangeMetric(column_name=text_length_descriptor, left=0, right=20),\n",
        "    ]\n",
        ")\n",
        "\n",
        "table_column_metrics_report.run(\n",
        "    reference_data=reviews_ref,\n",
        "    current_data=reviews_cur,\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "table_column_metrics_report"
      ],
      "outputs": [],
      "metadata": {
        "scrolled": true,
        "id": "571dfac8"
      },
      "id": "571dfac8"
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Column Tests with Descriptors"
      ],
      "metadata": {
        "id": "53cd8b2a"
      },
      "id": "53cd8b2a"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "def text_length_column():\n",
        "    \"\"\"Build a fresh TextLength descriptor bound to the Review_Text column.\"\"\"\n",
        "    return TextLength(display_name=\"TextLength\").for_column(\"Review_Text\")\n",
        "\n",
        "\n",
        "# Tabular column tests, each applied to a descriptor computed from Review_Text.\n",
        "column_descriptor_tests = [\n",
        "    TestColumnDrift(column_name=RegExp(reg_exp=r'.*\\?.*', display_name=\"Questions\").for_column(\"Review_Text\")),\n",
        "    TestValueRange(column_name=text_length_column()),\n",
        "    TestNumberOfOutRangeValues(column_name=text_length_column()),\n",
        "    TestShareOfOutRangeValues(column_name=text_length_column()),\n",
        "    TestMeanInNSigmas(column_name=text_length_column()),\n",
        "    TestColumnValueMin(column_name=SentenceCount(display_name=\"SentenceCount\").for_column(\"Review_Text\")),\n",
        "    TestColumnValueMax(column_name=WordCount(display_name=\"WordCount\").for_column(\"Review_Text\")),\n",
        "    TestColumnValueMean(column_name=Sentiment(display_name=\"Sentiment\").for_column(\"Review_Text\")),\n",
        "    TestColumnValueMedian(column_name=text_length_column()),\n",
        "    TestColumnValueStd(column_name=text_length_column()),\n",
        "    TestColumnQuantile(column_name=text_length_column(), quantile=0.25),\n",
        "]\n",
        "table_column_test_suite = TestSuite(tests=column_descriptor_tests)\n",
        "\n",
        "table_column_test_suite.run(\n",
        "    reference_data=reviews_ref,\n",
        "    current_data=reviews_cur,\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "table_column_test_suite"
      ],
      "outputs": [],
      "metadata": {
        "id": "6d94c639"
      },
      "id": "6d94c639"
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Dataset Metrics with Descriptors\n",
        "Currently, only the default descriptor configuration is available."
      ],
      "metadata": {
        "id": "7db95a30"
      },
      "id": "7db95a30"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Dataset-level metrics which use raw text data.\n",
        "# Stored under its own name so the column-level `table_column_metrics_report`\n",
        "# built earlier is not silently overwritten.\n",
        "table_dataset_metrics_report = Report(metrics=[\n",
        "    DatasetDriftMetric(columns=[\"Age\", \"Review_Text\"]),\n",
        "    DataDriftTable(columns=[\"Age\", \"Review_Text\"]),\n",
        "])\n",
        "\n",
        "table_dataset_metrics_report.run(\n",
        "    reference_data=reviews_ref,\n",
        "    current_data=reviews_cur,\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "table_dataset_metrics_report"
      ],
      "outputs": [],
      "metadata": {
        "id": "eb55abf8"
      },
      "id": "eb55abf8"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Dataset-level metric which uses descriptors.\n",
        "# Descriptors are given per text column as {display label: descriptor}.\n",
        "regression_descriptors = {\n",
        "    \"Review_Text\": {\n",
        "        \"Text Length\": TextLength(),\n",
        "        \"Reviews about Dress\": TriggerWordsPresence(words_list=['dress', 'gown']),\n",
        "        \"Review about Blouses\": TriggerWordsPresence(words_list=['blouse', 'shirt']),\n",
        "    },\n",
        "}\n",
        "\n",
        "regression_report = Report(\n",
        "    metrics=[\n",
        "        RegressionErrorBiasTable(columns=[\"Age\", \"Review_Text\"], descriptors=regression_descriptors),\n",
        "    ],\n",
        "    options={\"render\": {\"raw_data\": True}},\n",
        ")\n",
        "\n",
        "# This run gets its own mapping, which additionally names the target, the prediction and the task.\n",
        "regression_column_mapping = ColumnMapping(\n",
        "    target='Rating',\n",
        "    prediction='prediction',\n",
        "    numerical_features=['Age', 'Positive_Feedback_Count'],\n",
        "    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],\n",
        "    text_features=['Review_Text', 'Title'],\n",
        "    task='regression',\n",
        ")\n",
        "regression_report.run(\n",
        "    reference_data=reviews_ref,\n",
        "    current_data=reviews_cur,\n",
        "    column_mapping=regression_column_mapping,\n",
        ")\n",
        "\n",
        "regression_report"
      ],
      "outputs": [],
      "metadata": {
        "id": "6d9bc7b5"
      },
      "id": "6d9bc7b5"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Dataset-level metric which uses descriptors.\n",
        "# Descriptors are given per text column as {display label: descriptor}.\n",
        "classification_descriptors = {\n",
        "    \"Review_Text\": {\n",
        "        \"Text Length\": TextLength(),\n",
        "        \"Reviews about Dress\": TriggerWordsPresence(words_list=['dress', 'gown']),\n",
        "        \"Review about Blouses\": TriggerWordsPresence(words_list=['blouse', 'shirt']),\n",
        "        \"SentenceCount\": SentenceCount(),\n",
        "        \"WordCount\": WordCount(),\n",
        "        \"Sentiment\": Sentiment(),\n",
        "    },\n",
        "}\n",
        "\n",
        "classification_report = Report(\n",
        "    metrics=[\n",
        "        ClassificationQualityByFeatureTable(columns=[\"Age\", \"Review_Text\"], descriptors=classification_descriptors),\n",
        "    ],\n",
        "    options={\"render\": {\"raw_data\": True}},\n",
        ")\n",
        "\n",
        "# This run gets its own mapping, which additionally names the target, the prediction and the task.\n",
        "classification_column_mapping = ColumnMapping(\n",
        "    target='Rating',\n",
        "    prediction='prediction',\n",
        "    numerical_features=['Age', 'Positive_Feedback_Count'],\n",
        "    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],\n",
        "    text_features=['Review_Text', 'Title'],\n",
        "    task='classification',\n",
        ")\n",
        "classification_report.run(\n",
        "    reference_data=reviews_ref,\n",
        "    current_data=reviews_cur,\n",
        "    column_mapping=classification_column_mapping,\n",
        ")\n",
        "\n",
        "classification_report"
      ],
      "outputs": [],
      "metadata": {
        "scrolled": false,
        "id": "8e81be0d"
      },
      "id": "8e81be0d"
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Dataset Tests with Descriptors"
      ],
      "metadata": {
        "id": "04f9daac"
      },
      "id": "04f9daac"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "source": [
        "# Dataset-level tests which use raw text data.\n",
        "drifted_column_tests = [\n",
        "    TestNumberOfDriftedColumns(columns=[\"Age\", \"Review_Text\"]),\n",
        "    TestShareOfDriftedColumns(columns=[\"Age\", \"Review_Text\"]),\n",
        "]\n",
        "table_dataset_test_suite = TestSuite(tests=drifted_column_tests)\n",
        "\n",
        "# Mapping used for this run only; it names the target, the prediction and the task as well.\n",
        "dataset_tests_column_mapping = ColumnMapping(\n",
        "    target='Rating',\n",
        "    prediction='prediction',\n",
        "    numerical_features=['Age', 'Positive_Feedback_Count'],\n",
        "    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],\n",
        "    text_features=['Review_Text', 'Title'],\n",
        "    task='regression',\n",
        ")\n",
        "table_dataset_test_suite.run(\n",
        "    reference_data=reviews_ref,\n",
        "    current_data=reviews_cur,\n",
        "    column_mapping=dataset_tests_column_mapping,\n",
        ")\n",
        "table_dataset_test_suite"
      ],
      "outputs": [],
      "metadata": {
        "scrolled": true,
        "id": "495df999"
      },
      "id": "495df999"
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "eJUYC6TAESkO"
      },
      "id": "eJUYC6TAESkO",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.16"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}